#! /usr/bin/env python
# -*- coding: utf8 -*-
#
#@author www
#@date 2014-06-23
#
#crawl xieetuan.com
#
#

from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from gaodeha_crawl.items import PostItem
from gaodeha_crawl.common.db_utils import DbUtils 
from gaodeha_crawl.common.constant import *
from scrapy import log
import time
import datetime
import re
import random

class XieetuanSpider(CrawlSpider):
    """CrawlSpider for the comic sections of xieetuan.com.

    Starting from the section index pages in ``start_urls``, the spider
    follows the links selected by ``rules`` and converts matching pages
    into ``PostItem`` objects in ``parse_item``.
    """
    # Spider name.
    name = 'xieetuan'
    # Domain restriction for the crawl.
    allowed_domains = ['xieetuan.com']
    # Section index pages the crawl starts from. Entries that are commented
    # out are sections that are currently not crawled.
    start_urls = [
            "http://www.xieetuan.com/xieemanhua/",
            "http://www.xieetuan.com/mayifengmi/",
            "http://www.xieetuan.com/guafusandai/",
            "http://www.xieetuan.com/menglongyzhuan/",
            "http://www.xieetuan.com/renyvgongzhu/",
            "http://www.xieetuan.com/lezhangburu/",
            "http://www.xieetuan.com/dabikong/",
            #"http://www.xieetuan.com/gaogenxie/",
            #"http://www.xieetuan.com/xinxianmangguo/",
            #"http://www.xieetuan.com/bangbangbing/",
            #"http://www.xieetuan.com/DaHeJiangJun/",
            #"http://www.xieetuan.com/tianluoguniang/",
            #"http://www.xieetuan.com/sifenzhongliaoli/",
            #"http://www.xieetuan.com/gagamaga/",
            "http://www.xieetuan.com/sexiaozu/",
            #"http://www.xieetuan.com/woseqige/",
            #"http://www.xieetuan.com/pisalianlianqu/",
            #"http://www.xieetuan.com/WoTaiTaiShiNvGaoZhongSheng/",
            #"http://www.xieetuan.com/FuQiChengZhangRiJi/",
            #"http://www.xieetuan.com/MeiMeiYouDianGuai/",
            #"http://www.xieetuan.com/ShaoNianXiongHuaiDaZhi/",
            #"http://www.xieetuan.com/YuJieJinXingShi/",
            #"http://www.xieetuan.com/JiXieNvPu/",
            #"http://www.xieetuan.com/WoYuEMoDeHShengHuo/",
            #"http://www.xieetuan.com/JingBaoCaoMei/",
            #"http://www.xieetuan.com/ShaoNvPaiBie/",
            #"http://www.xieetuan.com/JinPingMei/",
            #"http://www.xieetuan.com/YaoJingDeWeiBa/",
            #"http://www.xieetuan.com/QiuSeZhiKong/",
            #"http://www.xieetuan.com/WoDeHuXianNvYou/",
            #"http://www.xieetuan.com/QinWenJieJie/",
            #"http://www.xieetuan.com/RiZaiXiaoYuan/",
            #"http://www.xieetuan.com/qigongzhu/",
            #"http://www.xieetuan.com/chiguoguo/",
            #"http://www.xieetuan.com/jingqixiansheng/",
            #"http://www.xieetuan.com/shixiong/",
            #"http://www.xieetuan.com/xiyouriji/",
            #"http://www.xieetuan.com/xiaochunjie/",
            #"http://www.xieetuan.com/xiaomingxilie/",
            #"http://www.xieetuan.com/juhuabaodian/",
            #"http://www.xieetuan.com/yibanbuchedan/",
            #"http://www.xieetuan.com/fankongjingying/",
            #"http://www.xieetuan.com/moshoushijie/",
            ]
    # Not referenced anywhere else in this file; presumably a length cap for
    # post briefs -- TODO confirm or remove.
    max_brief_index = 255
    # Maps each section URL prefix to the category stored with its posts.
    # The prefix is matched against page URLs with str.startswith() in
    # parse_item_category_id / parse_item_category_name, and __init__ inserts
    # any category_id that is missing from tb_category.
    category_urls = {
            "http://www.xieetuan.com/xieemanhua/":{
                    "category_id": "102000",
                    "category_name": "邪恶漫画",
            },
            "http://www.xieetuan.com/mayifengmi/":{
                    "category_id": "102001",
                    "category_name": "蚂蚁蜂蜜",
            },
            "http://www.xieetuan.com/guafusandai/":{
                    "category_id": "102002",
                    "category_name": "寡妇三代",
            },
            "http://www.xieetuan.com/menglongyzhuan/": {
                    "category_id": "102003",
                    "category_name": "梦龙Y传",
            },
            "http://www.xieetuan.com/renyvgongzhu/":{
                    "category_id": "102004",
                    "category_name": "人鱼公主",
            },
            "http://www.xieetuan.com/lezhangburu/":{
                    "category_id": "102005",
                    "category_name": "乐张不入",
            },
            "http://www.xieetuan.com/dabikong/":{
                    "category_id": "102006",
                    "category_name": "大鼻孔",
            },
            "http://www.xieetuan.com/gaogenxie/":{

                    "category_id": "102007",
                    "category_name": "高跟鞋",
            },
            "http://www.xieetuan.com/xinxianmangguo/":{

                    "category_id": "102008",
                    "category_name": "新鲜的芒果",
            },
            "http://www.xieetuan.com/bangbangbing/":{

                    "category_id": "102009",
                    "category_name": "棒棒冰",
            },
            "http://www.xieetuan.com/DaHeJiangJun/":{

                    "category_id": "102010",
                    "category_name": "大喝将军",
            },
            "http://www.xieetuan.com/tianluoguniang/":{

                    "category_id": "102011",
                    "category_name": "田螺姑娘",
            },
            "http://www.xieetuan.com/sifenzhongliaoli/":{

                    "category_id": "102012",
                    "category_name": "4分钟料理",
            },
            "http://www.xieetuan.com/gagamaga/":{

                    "category_id": "102013",
                    "category_name": "GAGAMAGA",
            },
            "http://www.xieetuan.com/sexiaozu/":{

                    "category_id": "102014",
                    "category_name": "色小组",
            },
            "http://www.xieetuan.com/woseqige/":{

                    "category_id": "102015",
                    "category_name": "我色其歌",
            },
            "http://www.xieetuan.com/pisalianlianqu/":{

                    "category_id": "102016",
                    "category_name": "披萨恋恋曲",
            },
            "http://www.xieetuan.com/WoTaiTaiShiNvGaoZhongSheng/":{

                    "category_id": "102017",
                    "category_name": "我太太是女高中生",
            },
            "http://www.xieetuan.com/FuQiChengZhangRiJi/":{

                    "category_id": "102018",
                    "category_name": "夫妻成长日记",
            },
            "http://www.xieetuan.com/MeiMeiYouDianGuai/":{

                    "category_id": "102019",
                    "category_name": "最近我的妹妹有点怪",
            },
            "http://www.xieetuan.com/ShaoNianXiongHuaiDaZhi/":{

                    "category_id": "102020",
                    "category_name": "少年啊！要胸怀大志！",
            },
            "http://www.xieetuan.com/YuJieJinXingShi/":{

                    "category_id": "102021",
                    "category_name": "御姐进行时",
            },
            "http://www.xieetuan.com/JiXieNvPu/":{

                    "category_id": "102022",
                    "category_name": "机械女仆",
            },
            "http://www.xieetuan.com/WoYuEMoDeHShengHuo/":{

                    "category_id": "102023",
                    "category_name": "我与恶魔的H生活",
            },
            "http://www.xieetuan.com/JingBaoCaoMei/":{

                    "category_id": "102024",
                    "category_name": "惊爆草莓",
            },
            "http://www.xieetuan.com/ShaoNvPaiBie/":{

                    "category_id": "102025",
                    "category_name": "少女派别",
            },
            "http://www.xieetuan.com/JinPingMei/":{

                    "category_id": "102026",
                    "category_name": "金瓶梅",
            },
            "http://www.xieetuan.com/YaoJingDeWeiBa/":{

                    "category_id": "102027",
                    "category_name": "妖精的尾巴",
            },
            "http://www.xieetuan.com/QiuSeZhiKong/":{

                    "category_id": "102028",
                    "category_name": "秋色之空",
            },
            "http://www.xieetuan.com/WoDeHuXianNvYou/":{

                    "category_id": "102029",
                    "category_name": "我的狐仙女友",
            },
            "http://www.xieetuan.com/QinWenJieJie/":{

                    "category_id": "102030",
                    "category_name": "亲吻姐姐",
            },
            "http://www.xieetuan.com/RiZaiXiaoYuan/":{

                    "category_id": "102031",
                    "category_name": "日在校园",
            },
            "http://www.xieetuan.com/qigongzhu/":{

                    "category_id": "102032",
                    "category_name": "七公主漫画",
            },
            "http://www.xieetuan.com/chiguoguo/":{

                    "category_id": "102033",
                    "category_name": "吃果果大作战",
            },
            "http://www.xieetuan.com/jingqixiansheng/":{

                    "category_id": "102034",
                    "category_name": "中国惊奇先生",
            },
            # Disabled entry (category id 102035 is therefore unused here):
            #"http://www.xieetuan.com/shixiong/":{
            #
            #        "category_id": "102035",
            #        "category_name": "尸兄",
            #},
            "http://www.xieetuan.com/xiyouriji/":{

                    "category_id": "102036",
                    "category_name": "西游日记",
            },
            "http://www.xieetuan.com/xiaochunjie/":{

                    "category_id": "102037",
                    "category_name": "请叫我小纯洁",
            },
            "http://www.xieetuan.com/xiaomingxilie/":{

                    "category_id": "102038",
                    "category_name": "小明系列",
            },
            "http://www.xieetuan.com/juhuabaodian/":{

                    "category_id": "102039",
                    "category_name": "菊花笑典",
            },
            "http://www.xieetuan.com/yibanbuchedan/":{

                    "category_id": "102040",
                    "category_name": "一般不扯淡",
            },
            "http://www.xieetuan.com/fankongjingying/":{

                    "category_id": "102041",
                    "category_name": "反恐精英",
            },
            "http://www.xieetuan.com/moshoushijie/":{

                    "category_id": "102042",
                    "category_name": "魔兽世界",
            }
        }
    # Site root. Its length is used to strip the scheme and host from each
    # start URL, leaving the section path (e.g. "/dabikong/") for the rules.
    domain = "http://www.xieetuan.com"
    # Link-following rules, one allow-pattern per start URL. The regex
    # fragments are raw strings so the backslash escapes reach the regex
    # engine untouched; the resulting pattern text is unchanged.
    # NOTE(review): the comprehensions read the class-level names `domain`
    # and `start_urls`, which relies on Python 2 list-comprehension scoping.
    rules = (
            # Section listing pages: followed for more links, not parsed.
            Rule(SgmlLinkExtractor(allow=[s[len(domain):] + r"list.*\.html" for s in start_urls], allow_domains=['xieetuan.com']), follow=True),
            # Numbered post pages: handed to parse_item.
            Rule(SgmlLinkExtractor(allow=[s[len(domain):] + r"\d*\.html" for s in start_urls], allow_domains=['xieetuan.com']), callback='parse_item'),
            )


    def __init__(self, *args, **kwargs):
        """Set up the crawl rules and make sure every category exists in the DB.

        Positional and keyword arguments are forwarded to
        ``CrawlSpider.__init__`` so that spider arguments supplied by the
        framework are accepted; calling with no arguments still works.

        For each entry of ``category_urls`` the ``tb_category`` table is
        queried by ``category_id`` and a row is inserted when none is found.
        """
        # Build the per-instance rules BEFORE the base initialiser runs:
        # CrawlSpider compiles self.rules during its __init__, so assigning
        # them afterwards would have no effect on the crawl.
        self.rules = (
                 Rule(SgmlLinkExtractor(allow=[s[len(self.domain):] + r"list.*\.html" for s in self.start_urls], allow_domains=['xieetuan.com']), follow=True),
                 Rule(SgmlLinkExtractor(allow=[s[len(self.domain):] + r"\d*\.html" for s in self.start_urls], allow_domains=['xieetuan.com']), callback='parse_item'),
        )
        CrawlSpider.__init__(self, *args, **kwargs)

        self.db = DbUtils()
        select_sql = "select category_id, category_name from tb_category where category_id=%s"
        insert_sql = "insert into tb_category (category_id, category_name) values (%s, %s)"
        for category in self.category_urls.values():
            existing = self.db.fetchOneDict(select_sql, (category["category_id"],))
            if existing:
                # Category already present; nothing to do.
                continue
            self.db.executeSql(insert_sql, (category["category_id"], category["category_name"]))


    def parse_item(self, response):
        """Build a ``PostItem`` from a crawled post page.

        Args:
            response: the downloaded page; ``response.url`` is used for the
                origin URL, the uuid, the category lookup and the ordering.

        Returns:
            A populated ``PostItem``, or ``None`` when the page has no
            ``<h1>`` text to use as a title (the page is logged and skipped
            instead of failing with an IndexError).

        Raises:
            Exception: propagated from the category lookup when the URL does
                not start with any key of ``category_urls``.
        """
        self.log("crawl parse url:" + response.url, level=log.DEBUG)
        selector = Selector(response)
        titles = selector.xpath('//h1/text()').extract()
        if not titles:
            self.log("crawl skip url without title:" + response.url, level=log.WARNING)
            return None

        # One timestamp for both fields so insert and update time always agree.
        now = int(time.time())

        item = PostItem()
        item['post_title'] = titles[0]
        item['post_content'] = self.parse_item_content(selector)
        item['post_type'] = POST_TYPE_IMAGE
        item['post_author'] = self.parse_item_author(response.url)
        item['post_status'] = POST_STATUS_CHECK
        item['post_brief'] = ""
        # All counters start at zero for a freshly crawled post.
        item['comment_count'] = 0
        item['praise_count'] = 0
        item['hate_count'] = 0
        item['collect_count'] = 0
        item['share_count'] = 0
        item['post_category_id'] = self.parse_item_category_id(response.url)
        item['post_category_name'] = self.parse_item_category_name(response.url)
        item['post_origin_url'] = response.url
        item['post_origin_sitename'] = '邪恶团'
        # The page URL doubles as the post's unique identifier.
        item['post_uuid'] = response.url
        item['post_insert_time'] = now
        item['post_show_time'] = self.parse_item_time(selector)
        item['post_update_time'] = now
        item['post_font_images'] = ''
        item['post_vedio_url'] = ''
        item['post_tags'] = ''
        item['top_order'] = self.parse_item_order(response.url)
        return item

    def parse_item_content(self, selector):
        imgs = selector.xpath('//li[@id="imgshow"]/img').extract()
        if not imgs:
            return ""
        return "".join([img for img in imgs])
        

    def parse_item_time(self, selector):
        postMeta = selector.xpath('//div[@class="ryou"]/text()').extract()
        p = re.compile('\\d{4}-\\d{2}-\\d{2}')
        m = p.search(postMeta[0])
        post_show_time = int(time.time())
        if m:
            timeStr = m.group(0)
            post_show_time = long(time.mktime(datetime.datetime.strptime(timeStr, '%Y-%m-%d').timetuple()))
        return post_show_time


    def parse_item_author(self, url):
        return "小蘑菇"

    def parse_item_category_id(self, url):
        for crawlUrl in self.category_urls:
            if url.startswith(crawlUrl):
                return self.category_urls[crawlUrl]["category_id"]
        log.msg("crawl error url:" + url, level=log.ERROR)
        raise Exception("crawl error url:" + url)

    def parse_item_category_name(self, url):
        for crawlUrl in self.category_urls:
            if url.startswith(crawlUrl):
                return self.category_urls[crawlUrl]["category_name"]
        log.msg("crawl error url:" + url, level=log.ERROR)
        raise Exception("crawl error url:" + url)

    def parse_item_order(self, url):
        pattern = "/(\d+).html"
        m = re.search(pattern, url)
        if m:
            return int(m.group(1))
        return 100



